# All the libraries used in this project
# General Libraries
import numpy as np
import pandas as pd
import pathlib
# Visualisation Libraries
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import matplotlib as mpl
import matplotlib.pylab as pylab
# Modelling Preprocessing
from sklearn.preprocessing import LabelEncoder # OneHotEncoder,
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
# Modelling Regression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error # mean_squared_log_error,
from sklearn.linear_model import LinearRegression, Ridge, Lasso # , RidgeCV, ElasticNet
from sklearn.ensemble import RandomForestRegressor # , BaggingRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from sklearn.neural_network import MLPRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn.svm import SVR
This dataset contains the prices and other attributes of almost 54,000 diamonds. There are 10 attributes included in the dataset, including the target, i.e. price.
Feature description:
price
price in US dollars ($326 -- $18,823). This is the target column, to be predicted from the other features.
The 4 Cs of Diamonds:-
carat (0.2--5.01) The carat is the diamond’s physical weight measured in metric carats. One carat equals 1/5 gram and is subdivided into 100 points. Carat weight is the most objective grade of the 4Cs.
cut (Fair, Good, Very Good, Premium, Ideal) In determining the quality of the cut, the diamond grader evaluates the cutter’s skill in the fashioning of the diamond. The more precise the diamond is cut, the more captivating the diamond is to the eye.
color, from J (worst) to D (best) The colour of gem-quality diamonds occurs in many hues. In the range from colourless to light yellow or light brown. Colourless diamonds are the rarest. Other natural colours (blue, red, pink for example) are known as "fancy,” and their colour grading is different from that of white colourless diamonds.
clarity (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best)) Diamonds can have internal characteristics known as inclusions or external characteristics known as blemishes. Diamonds without inclusions or blemishes are rare; however, most characteristics can only be seen with magnification.
Dimensions
x length in mm (0--10.74)
y width in mm (0--58.9)
z depth in mm (0--31.8)

depth total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79) The depth of the diamond is its height (in millimetres) measured from the culet (bottom tip) to the table (flat, top surface).
table width of the top of the diamond relative to widest point (43--95)
A diamond's table refers to the flat facet of the diamond seen when the stone is face up. The main purpose of a diamond table is to refract entering light rays and allow reflected light rays from within the diamond to meet the observer’s eye. The ideal table cut diamond will give the diamond stunning fire and brilliance.
# Load the raw diamonds data from the comma-separated file.
df_diamond = pd.read_csv("diamonds.csv", sep=",")
df_diamond
| Unnamed: 0 | carat | cut | color | clarity | depth | table | price | x | y | z | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
| 1 | 2 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
| 2 | 3 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
| 3 | 4 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
| 4 | 5 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 53935 | 53936 | 0.72 | Ideal | D | SI1 | 60.8 | 57.0 | 2757 | 5.75 | 5.76 | 3.50 |
| 53936 | 53937 | 0.72 | Good | D | SI1 | 63.1 | 55.0 | 2757 | 5.69 | 5.75 | 3.61 |
| 53937 | 53938 | 0.70 | Very Good | D | SI1 | 62.8 | 60.0 | 2757 | 5.66 | 5.68 | 3.56 |
| 53938 | 53939 | 0.86 | Premium | H | SI2 | 61.0 | 58.0 | 2757 | 6.15 | 6.12 | 3.74 |
| 53939 | 53940 | 0.75 | Ideal | D | SI2 | 62.2 | 55.0 | 2757 | 5.83 | 5.87 | 3.64 |
53940 rows × 11 columns
Schritte der Datenvorverarbeitung
# Remove the old CSV index column ("Unnamed: 0") — it carries no information.
df_diamond = df_diamond.drop(columns=["Unnamed: 0"])
df_diamond.head()
| carat | cut | color | clarity | depth | table | price | x | y | z | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
| 1 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
| 2 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
| 3 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
| 4 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
# Get the general information about the source data (dtypes, non-null counts, memory usage).
df_diamond.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 53940 entries, 0 to 53939 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 carat 53940 non-null float64 1 cut 53940 non-null object 2 color 53940 non-null object 3 clarity 53940 non-null object 4 depth 53940 non-null float64 5 table 53940 non-null float64 6 price 53940 non-null int64 7 x 53940 non-null float64 8 y 53940 non-null float64 9 z 53940 non-null float64 dtypes: float64(6), int64(1), object(3) memory usage: 4.1+ MB
# Count missing (null) values per column — the dataset is expected to have none.
df_diamond.isnull().sum()
carat 0 cut 0 color 0 clarity 0 depth 0 table 0 price 0 x 0 y 0 z 0 dtype: int64
# Count NA values per column.
# NOTE: pandas `isna()` is an alias of `isnull()`, so this repeats the previous check.
df_diamond.isna().sum()
carat 0 cut 0 color 0 clarity 0 depth 0 table 0 price 0 x 0 y 0 z 0 dtype: int64
# Show the rows that are exact duplicates of an earlier row.
df_diamond[df_diamond.duplicated()]
| carat | cut | color | clarity | depth | table | price | x | y | z | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1005 | 0.79 | Ideal | G | SI1 | 62.3 | 57.0 | 2898 | 5.90 | 5.85 | 3.66 |
| 1006 | 0.79 | Ideal | G | SI1 | 62.3 | 57.0 | 2898 | 5.90 | 5.85 | 3.66 |
| 1007 | 0.79 | Ideal | G | SI1 | 62.3 | 57.0 | 2898 | 5.90 | 5.85 | 3.66 |
| 1008 | 0.79 | Ideal | G | SI1 | 62.3 | 57.0 | 2898 | 5.90 | 5.85 | 3.66 |
| 2025 | 1.52 | Good | E | I1 | 57.3 | 58.0 | 3105 | 7.53 | 7.42 | 4.28 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 47969 | 0.52 | Ideal | D | VS2 | 61.8 | 55.0 | 1919 | 5.19 | 5.16 | 3.20 |
| 49326 | 0.51 | Ideal | F | VVS2 | 61.2 | 56.0 | 2093 | 5.17 | 5.19 | 3.17 |
| 49557 | 0.71 | Good | F | SI2 | 64.1 | 60.0 | 2130 | 0.00 | 0.00 | 0.00 |
| 50079 | 0.51 | Ideal | F | VVS2 | 61.2 | 56.0 | 2203 | 5.19 | 5.17 | 3.17 |
| 52861 | 0.50 | Fair | E | VS2 | 79.0 | 73.0 | 2579 | 5.21 | 5.18 | 4.09 |
146 rows × 10 columns
# Drop duplicate rows (keeping the first occurrence) and renumber the index.
df_diamond = df_diamond.drop_duplicates(keep="first").reset_index(drop=True)
# Verify the deletion: this should now return an empty frame.
df_diamond[df_diamond.duplicated()]
| carat | cut | color | clarity | depth | table | price | x | y | z |
|---|
# Check the data for changes
df_diamond.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 53794 entries, 0 to 53793 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 carat 53794 non-null float64 1 cut 53794 non-null object 2 color 53794 non-null object 3 clarity 53794 non-null object 4 depth 53794 non-null float64 5 table 53794 non-null float64 6 price 53794 non-null int64 7 x 53794 non-null float64 8 y 53794 non-null float64 9 z 53794 non-null float64 dtypes: float64(6), int64(1), object(3) memory usage: 4.1+ MB
# Find the dimensionless diamonds (rows where any of the x/y/z lengths is 0 mm).
df_diamond.loc[(df_diamond.x == 0) | (df_diamond.y == 0) | (df_diamond.z == 0), :]
| carat | cut | color | clarity | depth | table | price | x | y | z | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2201 | 1.00 | Premium | G | SI2 | 59.1 | 59.0 | 3142 | 6.55 | 6.48 | 0.0 |
| 2308 | 1.01 | Premium | H | I1 | 58.1 | 59.0 | 3167 | 6.66 | 6.60 | 0.0 |
| 4778 | 1.10 | Premium | G | SI2 | 63.0 | 59.0 | 3696 | 6.50 | 6.47 | 0.0 |
| 5457 | 1.01 | Premium | F | SI2 | 59.2 | 58.0 | 3837 | 6.50 | 6.47 | 0.0 |
| 10145 | 1.50 | Good | G | I1 | 64.0 | 61.0 | 4731 | 7.15 | 7.04 | 0.0 |
| 11156 | 1.07 | Ideal | F | SI2 | 61.6 | 56.0 | 4954 | 0.00 | 6.62 | 0.0 |
| 11935 | 1.00 | Very Good | H | VS2 | 63.3 | 53.0 | 5139 | 0.00 | 0.00 | 0.0 |
| 13570 | 1.15 | Ideal | G | VS2 | 59.2 | 56.0 | 5564 | 6.88 | 6.83 | 0.0 |
| 15914 | 1.14 | Fair | G | VS1 | 57.5 | 67.0 | 6381 | 0.00 | 0.00 | 0.0 |
| 24338 | 2.18 | Premium | H | SI2 | 59.4 | 61.0 | 12631 | 8.49 | 8.45 | 0.0 |
| 24464 | 1.56 | Ideal | G | VS2 | 62.2 | 54.0 | 12800 | 0.00 | 0.00 | 0.0 |
| 26063 | 2.25 | Premium | I | SI1 | 61.3 | 58.0 | 15397 | 8.52 | 8.42 | 0.0 |
| 26183 | 1.20 | Premium | D | VVS1 | 62.1 | 59.0 | 15686 | 0.00 | 0.00 | 0.0 |
| 27047 | 2.20 | Premium | H | SI1 | 61.2 | 59.0 | 17265 | 8.42 | 8.37 | 0.0 |
| 27364 | 2.25 | Premium | H | SI2 | 62.8 | 59.0 | 18034 | 0.00 | 0.00 | 0.0 |
| 27438 | 2.02 | Premium | H | VS2 | 62.7 | 53.0 | 18207 | 8.02 | 7.95 | 0.0 |
| 27672 | 2.80 | Good | G | SI2 | 63.8 | 58.0 | 18788 | 8.90 | 8.85 | 0.0 |
| 49413 | 0.71 | Good | F | SI2 | 64.1 | 60.0 | 2130 | 0.00 | 0.00 | 0.0 |
| 51361 | 1.12 | Premium | G | I1 | 60.4 | 59.0 | 2383 | 6.71 | 6.67 | 0.0 |
# Delete the dimensionless diamonds (any of x, y, z equal to 0 mm).
zero_dim = (df_diamond["x"] == 0) | (df_diamond["y"] == 0) | (df_diamond["z"] == 0)
df_diamond = df_diamond.loc[~zero_dim]
# BUG FIX: the original called reset_index(drop=True) without assigning the
# result (and without inplace=True), so the index was never actually reset.
df_diamond = df_diamond.reset_index(drop=True)
df_diamond.shape
(53775, 10)
# Verify that the deletion of the dimensionless diamonds succeeded (should be empty).
df_diamond.loc[(df_diamond.x == 0) | (df_diamond.y == 0) | (df_diamond.z == 0), :]
| carat | cut | color | clarity | depth | table | price | x | y | z |
|---|
# Descriptive statistics of the numeric columns (count, mean, std, quartiles, min/max).
df_diamond.describe()
| carat | depth | table | price | x | y | z | |
|---|---|---|---|---|---|---|---|
| count | 53775.000000 | 53775.000000 | 53775.000000 | 53775.000000 | 53775.000000 | 53775.000000 | 53775.000000 |
| mean | 0.797536 | 61.748232 | 57.457806 | 3931.220288 | 5.731579 | 5.734909 | 3.539964 |
| std | 0.473169 | 1.429639 | 2.233257 | 3985.918716 | 1.118611 | 1.139451 | 0.702017 |
| min | 0.200000 | 43.000000 | 43.000000 | 326.000000 | 3.730000 | 3.680000 | 1.070000 |
| 25% | 0.400000 | 61.000000 | 56.000000 | 951.000000 | 4.710000 | 4.720000 | 2.910000 |
| 50% | 0.700000 | 61.800000 | 57.000000 | 2401.000000 | 5.700000 | 5.710000 | 3.530000 |
| 75% | 1.040000 | 62.500000 | 59.000000 | 5324.000000 | 6.540000 | 6.540000 | 4.030000 |
| max | 5.010000 | 79.000000 | 95.000000 | 18823.000000 | 10.740000 | 58.900000 | 31.800000 |
## Data distribution analysis
#Review of numeric and categorical data distibutions and boxplots to visualize outliers.
# Plot the distribution (KDE) of every numeric variable on a 3x3 grid.
int_cols = df_diamond.select_dtypes(exclude='object').columns.to_list()  # all non-object columns
fig = plt.figure(figsize=(15, 10))
plt.suptitle("Numeric variables: Distribution", size=20, weight='bold')
for j, col in enumerate(int_cols):
    ax = plt.subplot(331 + j)
    ax = sns.kdeplot(data=df_diamond, x=col, fill=True, edgecolor="black", alpha=0.8)
# Box plots of the numeric variables to surface outliers.
fig = plt.figure(figsize=(15, 10))
plt.suptitle("Numeric variables: Box plots", size=20, weight='bold')
for j, col in enumerate(int_cols):
    ax = plt.subplot(331 + j)
    ax = sns.boxplot(data=df_diamond, x=col)
# Visualise the distribution of the three categorical variables side by side.
# Decomposed from three near-identical copy-pasted subplot sections into one loop;
# the first axes becomes the shared-y anchor for the other two (as before).
cat_cols = df_diamond.select_dtypes(include='object').columns.to_list()
fig = plt.figure(figsize=(15, 5))
plt.suptitle("Categorical variables: Distribution", size=20, weight='bold')
shared_axes = None
for j, col in enumerate(cat_cols):
    ax = plt.subplot(1, 3, j + 1, sharey=shared_axes)
    sns.countplot(data=df_diamond, x=col, ax=ax, edgecolor="black")
    # Hide the frame around each subplot, matching the original styling.
    for s in ['left', 'right', 'top', 'bottom']:
        ax.spines[s].set_visible(False)
    if shared_axes is None:
        shared_axes = ax
## Outliers
# According to the IQR method, outliers are defined via the IQR (distance between Q1 and Q3).
# The fences are Q1 - 1.5*IQR and Q3 + 1.5*IQR; any observation outside them is an outlier.
# If we use the IQR method on our diamond dataset, we remove ca. 12% of our data.
import scipy.stats as st  # kept for parity with the original cell; not used below

# FIX: compute the quantiles and the outlier mask on the numeric columns only.
# The original compared the whole (mixed-dtype) frame against a numeric Series,
# which raised the FutureWarning shown below and breaks on modern pandas;
# the object columns could never be flagged as outliers anyway, so the mask is identical.
numeric = df_diamond.select_dtypes(exclude='object')
Q1 = numeric.quantile(0.25)
Q3 = numeric.quantile(0.75)
IQR = Q3 - Q1
outlier_mask = ((numeric < (Q1 - 1.5 * IQR)) | (numeric > (Q3 + 1.5 * IQR))).any(axis=1)
df_diamond_wo_outliers = df_diamond[~outlier_mask]
int_cols = df_diamond_wo_outliers.select_dtypes(exclude='object').columns.to_list()
fig = plt.figure(figsize=(15, 10))
plt.suptitle("Numeric variables after removing the outliers", size=20, weight='bold')
for j, col in enumerate(int_cols):
    ax = plt.subplot(331 + j)
    ax = sns.boxplot(data=df_diamond_wo_outliers, x=col)
df_diamond_wo_outliers.info()
df_diamond.info()
C:\Users\Dell\AppData\Local\Temp\ipykernel_10856\823167053.py:15: FutureWarning: Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version. Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right` df_diamond_wo_outliers=df_diamond[~((df_diamond<(Q1-1.5*IQR))|(df_diamond>(Q3+1.5*IQR))).any(axis=1)]
<class 'pandas.core.frame.DataFrame'> Int64Index: 47412 entries, 0 to 53793 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 carat 47412 non-null float64 1 cut 47412 non-null object 2 color 47412 non-null object 3 clarity 47412 non-null object 4 depth 47412 non-null float64 5 table 47412 non-null float64 6 price 47412 non-null int64 7 x 47412 non-null float64 8 y 47412 non-null float64 9 z 47412 non-null float64 dtypes: float64(6), int64(1), object(3) memory usage: 4.0+ MB <class 'pandas.core.frame.DataFrame'> Int64Index: 53775 entries, 0 to 53793 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 carat 53775 non-null float64 1 cut 53775 non-null object 2 color 53775 non-null object 3 clarity 53775 non-null object 4 depth 53775 non-null float64 5 table 53775 non-null float64 6 price 53775 non-null int64 7 x 53775 non-null float64 8 y 53775 non-null float64 9 z 53775 non-null float64 dtypes: float64(6), int64(1), object(3) memory usage: 6.5+ MB
# Outliers part 2
"""
We found that using a 1.5 IQR removes too many data, which we would need to build a reliable model.
Therefore, we check the data distribution again and decide on the cutoff for each property separately.
X has outliers, however they do not seem unrealistic. A length of 10 mm is plausible, therefore,
we do not remove any data here.
Y and Z have got some extreme outliers, namely width above 30 mmm.
That seems unlikely. These three datapoints will be removed.
Carat has got a large range and the distribution has got a long tail (right-skew).
Most diamonds are under 2.5 carats and the diamonds above this weight could distort our model.
The largest 1% of data will be ignored. We therefore filter diamonds above 99% quantile.
"""
# NOTE(review): the text above says "filter diamonds above the 99% quantile", but the
# filter below uses the fixed thresholds y > 20, z > 10 and carat > 2.5 — confirm
# these hard-coded cutoffs match the intended quantile-based rule.
index_diamond = df_diamond[ (df_diamond['y'] > 20) | (df_diamond['z'] > 10) | (df_diamond['carat'] > 2.5)].index
df_diamond.drop(index_diamond, inplace=True)
df_diamond.describe()
# Price distribution: fine-grained histogram (bin width = 50 dollars).
sns.displot(data=df_diamond, x='price',binwidth= 50)
# Carat distribution: fine-grained histogram (bin width = 0.01 carat).
sns.displot(data=df_diamond, x='carat',binwidth= 0.01)
# Price distribution also has a long tail, despite filtering out the largest diamonds.
# We also observe a price gap at ca. 1500 dollars; this could be an error in the data.
# Price distribution and carat distribution are different, meaning that the price of a
# diamond is not a simple function of carat.
# Let's see the box plots of numerical variables with the outliers removed:
# Box plots of the numeric variables again, now on the outlier-filtered data.
fig = plt.figure(figsize=(15, 10))
plt.suptitle("Numeric variables: Box plots", size=20, weight='bold')
for j, col in enumerate(int_cols):
    ax = plt.subplot(331 + j)
    ax = sns.boxplot(data=df_diamond, x=col)
# Share of rows removed as outliers, relative to the de-duplicated row count (53794).
# FIX: derive the remaining count from the frame itself instead of the stale
# hard-coded 53651, so the figure stays correct if the filtering above changes.
percentage_of_removed_data = (53794 - len(df_diamond)) / 53794 * 100
percentage_of_removed_data
#We have only removed 0.27% of all data due to outliers.
#This should not have a noticeable impact on the model building,
#therefore, we decided to keep this outliers removal method.
0.26582890285161914
#MB_ANFANG
# Untersuchung der metrischen Merkmale "price", "carat", "depth" und "table" auf Normalverteilung:
#Das Quantil-Quantil-Diagramm dient zur Prüfung von Variablen
# auf Normalverteilung.
# Es vergleicht die tatsächliche Verteilung einer Variable
# mit der idealtypischen Normalverteilung – mithilfe der Quantile.
# Letzteres ist eher unwichtig, da eigentlich nur ein Streudiagramm mit
# einer Gerade erzeugt und interpretiert werden muss.
# Für das Q-Q-Diagramm empfiehlt sich eine z-Standardisierung
# der betreffenden Variable.
# Q-Q plots to check 'price', 'carat', 'depth' and 'table' for normality.
# Each variable is z-standardised first; the x-limits of each panel are clipped
# to [Q1 - 1.5*IQR, Q3 + 1.5*IQR] of the standardised data.
import statsmodels.api as sm
import pylab as py
import scipy.stats as stats
# NOTE(review): this rebinds `st` (previously `scipy.stats` from the IQR cell)
# to the stdlib `statistics` module — intentional here, but easy to trip over.
import statistics as st
c_valuator = 1.5  # multiplier for the IQR when clipping the x-axis
data_price = stats.zscore(df_diamond.price) # measurement scale ( $, [326 ; 18823] )
max_sampleprice = max(data_price)
min_sampleprice = min(data_price)
third_pricequartiles = st.quantiles(data_price, n=4)  # [Q1, Q2, Q3]
iqr_pricequartiles = third_pricequartiles[2] - third_pricequartiles[0]
data_carat = stats.zscore(df_diamond.carat) # measurement scale (weight in ct, [0.2 ; 5.01] )
max_samplecarat = max(data_carat)
min_samplecarat = min(data_carat)
third_caratquartiles = st.quantiles(data_carat, n=4)
iqr_caratquartiles = third_caratquartiles[2] - third_caratquartiles[0]
data_depth = stats.zscore(df_diamond.depth) # NOTE(review): comment said "mm, [0 ; 31.8]" — that is z's range; depth is a percentage (43–79). Confirm.
max_sampledepth = max(data_depth)
min_sampledepth = min(data_depth)
third_depthquartiles = st.quantiles(data_depth, n=4)
iqr_depthquartiles = third_depthquartiles[2] - third_depthquartiles[0]
data_table = stats.zscore(df_diamond.table) # measurement scale (%, [43 ; 95] )
max_sampletable = max(data_table)
min_sampletable = min(data_table)
third_tablequartiles = st.quantiles(data_table, n=4)
iqr_tablequartiles = third_tablequartiles[2] - third_tablequartiles[0]
# 2x2 grid: price (top-left), carat (top-right), depth (bottom-left), table (bottom-right).
fig=plt.figure(figsize=(15,15))
ax = fig.add_subplot(2, 2, 1)
sm.graphics.qqplot(data_price,line='45', ax=ax)
ax.set_xlim(third_pricequartiles[0]-c_valuator*iqr_pricequartiles, third_pricequartiles[2]+c_valuator*iqr_pricequartiles)
#ax.set_ylim(min_sampleprice, max_sampleprice)
left = -1.5 #x coordinate for text insert
top = ax.get_ylim()[1] * 0.75
txt = ax.text(left, top, "QQ-Plot für Merkmal 'price' ", verticalalignment='top')
txt.set_bbox(dict(facecolor='k', alpha=0.1))
ax = fig.add_subplot(2, 2, 2)
# 'line="s"' fits the reference line to the sample's std/mean (unlike the 45° line above).
sm.graphics.qqplot(data_carat, line='s', ax=ax)
ax.set_xlim(third_caratquartiles[0]-c_valuator*iqr_caratquartiles, third_caratquartiles[2]+c_valuator*iqr_caratquartiles)
#ax.set_ylim(min_samplecarat, max_samplecarat)
left = -1 #x coordinate for text insert
top = ax.get_ylim()[1] * 0.75
txt = ax.text(left, top,"QQ-Plot für Merkmal 'carat' ", verticalalignment='top')
txt.set_bbox(dict(facecolor='k', alpha=0.1))
ax = fig.add_subplot(2, 2, 3)
# fit=True standardises the data before comparing against the 45° line.
sm.graphics.qqplot(data_depth, line='45', fit=True, ax=ax)
ax.set_xlim(third_depthquartiles[0]-c_valuator*iqr_depthquartiles, third_depthquartiles[2]+c_valuator*iqr_depthquartiles)
#ax.set_ylim(min_sampledepth, max_sampledepth)
left = -1 #x coordinate for text insert
top = ax.get_ylim()[1] * 0.75
txt = ax.text(left, top,"QQ-Plot für Merkmal 'depth' ", verticalalignment='top')
txt.set_bbox(dict(facecolor='k', alpha=0.1))
ax = fig.add_subplot(2, 2, 4)
sm.graphics.qqplot(data_table, line='45', fit=True, ax=ax)
ax.set_xlim(third_tablequartiles[0]-c_valuator*iqr_tablequartiles, third_tablequartiles[2]+c_valuator*iqr_tablequartiles)
#ax.set_ylim(min_sampletable, max_sampletable)
left = -1 #x coordinate for text insert
top = ax.get_ylim()[1] * 0.75
txt = ax.text(left, top, "QQ-Plot für Merkmal 'table' ", verticalalignment='top')
txt.set_bbox(dict(facecolor='k', alpha=0.1))
# Hier ist es wichtig, dass die Beobachtungen (=Punkte) möglichst
# auf oder nahe der idealen Normalverteilung (= Gerade) liegen.
# Je weiter weg Punkte sind, desto eher bezeichnet man sie als "Ausreißer"
# bzw. sind sie von einer idealen Normalverteilung weg.
# Eine genaue Vorgehensweise, was normalverteilt ist und was nicht,
# gibt es allerdings nicht.
# Wenn die Mehrheit der Punkte auf oder nahe der Gerade liegen,
# ist dies meist ausreichend, um von einer hinreichenden
# Normalverteilung auszugehen.
# Exakt einer Normalverteilung wird sowieso fast nie eine Verteilung entsprechen,
# weswegen der Anwender hier auch mitunter etwas pragmatischer sein kann.
# Ergebnis: Für die relevanten Merkmale "carat" und "price" liegen keine Normaverteilungen vor.
#MB_ENDE
#MB_Anfang
# Examine the ordinal features "cut", "color" and "clarity" with Pareto charts.
# Decomposed from three copy-pasted sections into one helper; this also builds the
# frequency table explicitly by name, which is robust against the pandas change
# where value_counts() names its column 'count' (the original's rename() would
# then silently do nothing).
from matplotlib.ticker import PercentFormatter

def _pareto_chart(counts, feature, text_x):
    """Draw a Pareto chart (bars = absolute counts, line = cumulative %) for one feature.

    counts  -- value_counts() Series for the feature
    feature -- feature name, used for column labels and the chart caption
    text_x  -- x coordinate for the caption text
    """
    table = pd.DataFrame({
        f"{feature}_art": counts.index,
        f"{feature}_absolute": counts.values,
    })
    table = table.sort_values(by=f"{feature}_absolute", ascending=False)
    table[f"{feature}_percentage"] = table[f"{feature}_absolute"].cumsum() / table[f"{feature}_absolute"].sum() * 100
    fig, ax = plt.subplots()
    ax.bar(table[f"{feature}_art"], table[f"{feature}_absolute"], color="C0")
    ax2 = ax.twinx()  # second y-axis for the cumulative percentage line
    ax2.plot(table[f"{feature}_art"], table[f"{feature}_percentage"], color="C1", marker="D", ms=7)
    ax2.yaxis.set_major_formatter(PercentFormatter())
    ax.tick_params(axis="y", colors="C0")
    ax2.tick_params(axis="y", colors="C1")
    top = ax.get_ylim()[1] * 0.75
    txt = ax.text(text_x, top, f"Pareto -'{feature}' ", verticalalignment='top')
    plt.show()

cat_cols = df_diamond.select_dtypes(include='object').columns.to_list()
_pareto_chart(df_diamond[cat_cols[0]].value_counts(), "cut", 2.5)
_pareto_chart(df_diamond[cat_cols[1]].value_counts(), "color", 4.5)
_pareto_chart(df_diamond[cat_cols[2]].value_counts(), "clarity", 4.5)
#absfreq = df_diamond.groupby(['color','cut', 'clarity']).size()
#absfreq
#MB_Ende
#MB_ANFANG
# Beispiel:
# Die Ausprägung/Bewertung "ideal" des Merkmals "cut" beinhaltet über 40% aller Schliffe.
# Die Ausprägungen/Bewertungen "ideal" und "premium" des Merkmals "cut" beinhalteten zusammen
# fast 70% aller Schliffe....
# Auf der Suche nach Auffälligkeiten, Erklärungsversuchen und Clustern:
# Looking for anomalies, possible explanations and clusters:
# the same depth/table scatter (bubble size = price) coloured by each of four
# features in turn — folded from four copy-pasted cells into one loop.
import plotly.express as px
import plotly.graph_objects as go

cat_cols = df_diamond.select_dtypes(include='object').columns.to_list()
sample_thing = df_diamond.copy(deep=True)
for hue in ("cut", "color", "clarity", "carat"):
    figure = px.scatter(data_frame=sample_thing,
                        x="depth",
                        y="table",
                        size="price",
                        color=hue,
                        trendline="ols")
    figure.show()
# Looking for anomalies, possible explanations and clusters:
# volume (x*y*z) vs carat, bubble size = price, coloured by cut and by color.
sample_thing["size"] = sample_thing["x"] * sample_thing["y"] * sample_thing["z"]
for hue in ("cut", "color"):
    figure = px.scatter(data_frame=sample_thing,
                        x="size",
                        y="carat",
                        size="price",
                        color=hue,
                        trendline="ols")
    figure.show()
# Unsurprisingly: volume and carat show a positive, linear relationship.
# Box plots of price per cut, additionally split by color / clarity / carat —
# folded from three copy-pasted cells into one loop.
for hue in ("color", "clarity", "carat"):
    fig = px.box(sample_thing,
                 x="cut",
                 y="price",
                 color=hue)
    fig.show()
# Here we see that 'price' cannot be explained by 'cut' and 'carat' alone.
# It cannot be ruled out that cut, color, clarity and carat are indeed the
# regressors for the regressand 'price'.
# The four Cs as regressors and 'price' as regressand — the same 3D scatter
# coloured by cut, color and clarity in turn.
for hue in ("cut", "color", "clarity"):
    fig = px.scatter_3d(sample_thing,
                        x='carat',
                        y='size',
                        z='price',
                        color=hue)
    fig.show()
# ERGEBNIS:
# Durch die Erkenntnisse aus der Explorativen Datenanalyse ist es gerechtfertigt,
# daß wir folgende maschinelle Verfahren einsetzen:
# LinearRegression, Lasso, Ridge,RandomForestRegressor,KNeighborsRegressor, DecisionTreeRegressor
# MB_ENDE
# Collect the names of the categorical (object-dtype) columns.
object_cols = df_diamond.select_dtypes(include="object").columns.to_list()
print("Categorical variables:")
print(object_cols)
Categorical variables: ['cut', 'color', 'clarity']
# Integer-encode the three categorical columns in place.
# NOTE(review): LabelEncoder assigns codes alphabetically, so the natural ordinal
# order of cut/color/clarity is NOT preserved (e.g. 'Ideal' does not get the
# highest cut code) — harmless for tree models, but worth confirming for the
# linear models fitted below.
label_encoder = LabelEncoder()
df_diamond['cut'] = label_encoder.fit_transform(df_diamond['cut'])
df_diamond['color'] = label_encoder.fit_transform(df_diamond['color'])
df_diamond['clarity'] = label_encoder.fit_transform(df_diamond['clarity'])
df_diamond.head()
| carat | cut | color | clarity | depth | table | price | x | y | z | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.23 | 2 | 1 | 3 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
| 1 | 0.21 | 3 | 1 | 2 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
| 2 | 0.23 | 1 | 1 | 4 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
| 3 | 0.29 | 3 | 5 | 5 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
| 4 | 0.31 | 1 | 6 | 3 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
# Correlation heatmap of all (now numeric) columns.
plt.figure(figsize=(12, 8))
correlations = df_diamond.corr()
sns.heatmap(correlations, annot=True, cmap="RdBu_r")
<AxesSubplot:>
# Pairwise scatter plots of every column against every other column.
g = sns.PairGrid(df_diamond)
g.map(sns.scatterplot)
<seaborn.axisgrid.PairGrid at 0x250c8d7dd90>
# List the remaining column names after preprocessing.
df_diamond.columns
Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
'z'],
dtype='object')
# Separate features from the target, hold out 20% for testing, then standard-scale.
X = df_diamond.drop(columns=["price"])
y = df_diamond["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=66)
# Keep references to the unscaled splits for the pipeline experiments below
# (the pipelines contain their own scaler step).
X_train_unskaliert = X_train
X_test_unskaliert = X_test
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
Schritte bei der Modellerstellung und beim Testen
# Building pipelins of standard scaler and model for varios regressors.
pipeline_slr = Pipeline([("scalar1",StandardScaler()), ("lr",LinearRegression())])
pipeline_sla = Pipeline([("scalar2",StandardScaler()), ("la",Lasso(tol=0.01))])
pipeline_sri = Pipeline([("scalar3",StandardScaler()), ("ri",Ridge())])
pipeline_srf = Pipeline([("scalar4",StandardScaler()), ("rf",RandomForestRegressor())])
pipeline_skn = Pipeline([("scalar5",StandardScaler()), ("kn",KNeighborsRegressor())])
pipeline_sdt = Pipeline([("scalar6",StandardScaler()),("dt",DecisionTreeRegressor())])
# List of all the pipelines
pipelines_s = [pipeline_slr, pipeline_sla, pipeline_sri, pipeline_srf, pipeline_skn, pipeline_sdt]
# Dictionary of pipelines and model types for ease of reference
pipe_dict = {0: "LinearRegression", 1: "Lasso", 2: "Ridge", 3: "RandomForestRegressor", 4: "KNeighborsRegressor", 5: "DecisionTreeRegressor"}
# Fit the pipelines get the scores.
scores_s = []
for i, model in enumerate(pipelines_s):
model.fit(X_train_unskaliert, y_train)
score = model.score(X_train_unskaliert, y_train)
if score > 0.99:
result = "Possible Overfit"
elif score < 0.7:
result = "Underfit"
else:
result = "Reasonable"
score_data = [pipe_dict[i], round(score, 4), result]
scores_s.append(score_data)
# Scores to dataframe
df_scores_s = pd.DataFrame(scores_s)
df_scores_s.columns = ["Model Name", "Score", "Result"]
df_scores_s.sort_values(by="Score", ascending=False)
| Model Name | Score | Result | |
|---|---|---|---|
| 5 | DecisionTreeRegressor | 1.0000 | Possible Overfit |
| 3 | RandomForestRegressor | 0.9974 | Possible Overfit |
| 4 | KNeighborsRegressor | 0.9736 | Reasonable |
| 0 | LinearRegression | 0.8946 | Reasonable |
| 2 | Ridge | 0.8946 | Reasonable |
| 1 | Lasso | 0.8944 | Reasonable |
# Building pipelins of standard scaler and model for varios regressors.
pipeline_mlr = Pipeline([("scalar1",MinMaxScaler()), ("lr",LinearRegression())])
pipeline_mla = Pipeline([("scalar2",MinMaxScaler()), ("la",Lasso(tol=0.001))])
pipeline_mri = Pipeline([("scalar3",MinMaxScaler()), ("ri",Ridge())])
pipeline_mrf = Pipeline([("scalar4",MinMaxScaler()), ("rf",RandomForestRegressor())])
pipeline_mkn = Pipeline([("scalar5",MinMaxScaler()), ("kn",KNeighborsRegressor())])
pipeline_mdt = Pipeline([("scalar6",MinMaxScaler()),("dt",DecisionTreeRegressor())])
# List of all the pipelines
pipelines_m = [pipeline_mlr, pipeline_mla, pipeline_mri, pipeline_mrf, pipeline_mkn, pipeline_mdt]
# Dictionary of pipelines and model types for ease of reference
pipe_dict = {0: "LinearRegression", 1: "Lasso", 2: "Ridge", 3: "RandomForestRegressor", 4: "KNeighborsRegressor", 5: "DecisionTreeRegressor"}
# Fit the pipelines get the scores.
scores_m = []
for i, model in enumerate(pipelines_m):
model.fit(X_train_unskaliert, y_train)
score = model.score(X_train_unskaliert, y_train)
if score > 0.99:
result = "Possible Overfit"
elif score < 0.7:
result = "Underfit"
else:
result = "Reasonable"
score_data = [pipe_dict[i], round(score, 4), result]
scores_m.append(score_data)
# Scores to dataframe
df_scores_m = pd.DataFrame(scores_m)
df_scores_m.columns = ["Model Name", "Score", "Result"]
df_scores_m.sort_values(by="Score", ascending=False)
| Model Name | Score | Result | |
|---|---|---|---|
| 5 | DecisionTreeRegressor | 1.0000 | Possible Overfit |
| 3 | RandomForestRegressor | 0.9974 | Possible Overfit |
| 4 | KNeighborsRegressor | 0.9836 | Reasonable |
| 0 | LinearRegression | 0.8946 | Reasonable |
| 2 | Ridge | 0.8945 | Reasonable |
| 1 | Lasso | 0.8935 | Reasonable |
# List of the linear-family pipelines (Linear, Lasso, Ridge).
lr_pipelines = [pipeline_mlr, pipeline_mla, pipeline_mri]
# Dictionary of pipelines and model types for ease of reference.
lr_dict = {0: "LinearRegression", 1: "Lasso", 2: "Ridge"}
# Fit the pipelines; collect test-set metrics plus 10-fold CV score.
scores2 = []
for i, model in enumerate(lr_pipelines):
    model.fit(X_train, y_train)
    cvs = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mse ** 0.5  # reuse mse instead of recomputing mean_squared_error
    score_r2 = r2_score(y_test, y_pred)
    # BUG FIX: look the model name up in lr_dict (the loop iterates
    # lr_pipelines); pipe_dict only matched because its first three
    # entries happen to coincide.
    score_data2 = [lr_dict[i], round(score_r2, 4), round(cvs.mean(), 4), round(mse, 4), round(mae, 4), round(rmse, 4)]
    scores2.append(score_data2)
# Scores to dataframe, best first.
df_scores2 = pd.DataFrame(scores2)
df_scores2.columns = ["Model Name", "Score_R2", "Score_CV", "MSE", "MAE", "RMSE"]
df_scores2.sort_values(by="Score_R2", ascending=False)
| Model Name | Score_R2 | Score_CV | MSE | MAE | RMSE | |
|---|---|---|---|---|---|---|
| 0 | LinearRegression | 0.8931 | 0.8945 | 1.686091e+06 | 834.9792 | 1298.4958 |
| 2 | Ridge | 0.8928 | 0.8944 | 1.690209e+06 | 835.9587 | 1300.0804 |
| 1 | Lasso | 0.8918 | 0.8934 | 1.706253e+06 | 837.3424 | 1306.2363 |
# Deeper test of the algorithm: LinearRegression hyper-parameter search.
param_grid = [{"fit_intercept": [True, False]}]
lr_reg = LinearRegression()
grid_search = GridSearchCV(lr_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_train, y_train)
print("GS LR Best parameters: ", grid_search.best_params_)
print("GS LR Best estimator: ", grid_search.best_estimator_)
# Show RMSE per parameter combination (scores are negative MSE).
cvres = grid_search.cv_results_
for neg_mse, cfg in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-neg_mse), cfg)
GS LR Best parameters: {'fit_intercept': True}
GS LR Best estimator: LinearRegression()
1281.1199825003957 {'fit_intercept': True}
4112.860605108131 {'fit_intercept': False}
# Deeper test of the algorithm: Lasso hyper-parameter search.
param_grid = [{'alpha': [1, 5, 10], "fit_intercept": [True, False]}]
lasso_reg = Lasso(tol=0.1)
grid_search = GridSearchCV(lasso_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_train, y_train)
print("GS Lasso Best parameters: ", grid_search.best_params_)
print("GS Lasso Best estimator: ", grid_search.best_estimator_)
# Show RMSE per parameter combination (scores are negative MSE).
cvres = grid_search.cv_results_
for neg_mse, cfg in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-neg_mse), cfg)
GS Lasso Best parameters: {'alpha': 1, 'fit_intercept': True}
GS Lasso Best estimator: Lasso(alpha=1, tol=0.1)
1353.6018795028754 {'alpha': 1, 'fit_intercept': True}
4113.493726461718 {'alpha': 1, 'fit_intercept': False}
1354.2433911715011 {'alpha': 5, 'fit_intercept': True}
4115.0055028011675 {'alpha': 5, 'fit_intercept': False}
1354.6454175946317 {'alpha': 10, 'fit_intercept': True}
4117.912236032825 {'alpha': 10, 'fit_intercept': False}
# Deeper test of the algorithm: Ridge hyper-parameter search.
param_grid = [{'alpha': [1, 5, 10], "fit_intercept": [True, False]}]
ridge_reg = Ridge()
grid_search = GridSearchCV(ridge_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_train, y_train)
print("GS Ridge Best parameters: ", grid_search.best_params_)
print("GS Ridge Best estimator: ", grid_search.best_estimator_)
# Show RMSE per parameter combination (scores are negative MSE).
cvres = grid_search.cv_results_
for neg_mse, cfg in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-neg_mse), cfg)
GS Ridge Best parameters: {'alpha': 1, 'fit_intercept': True}
GS Ridge Best estimator: Ridge(alpha=1)
1281.1115726341545 {'alpha': 1, 'fit_intercept': True}
4112.85066056173 {'alpha': 1, 'fit_intercept': False}
1281.1548993241327 {'alpha': 5, 'fit_intercept': True}
4112.839266732025 {'alpha': 5, 'fit_intercept': False}
1281.3296734376822 {'alpha': 10, 'fit_intercept': True}
4112.870042273604 {'alpha': 10, 'fit_intercept': False}
Best model seems to be Ridge with an RMSE of 1281.11 (alpha = 1, fit_intercept = True)
# Evaluate the tuned Ridge model against the held-out test set.
r_test = Ridge(alpha=1, fit_intercept=True)
r_test.fit(X_train, y_train)
# Prediction on unseen data.
ypred_test = r_test.predict(X_test)
# Report the R^2 score on the test set.
print("Linear Model / Ridge Test Result: ", round(r2_score(y_test, ypred_test), 4))
Linear Model / Ridge Test Result: 0.8931
def evaluate_model(x_test=None, y_test=None, model=None, name='Linear Regression', color='red'):
    """Predict with `model` on `x_test`, print metric scores and plot predictions vs. actuals.

    Steps: 1. make predictions  2. evaluate the model (via metric_scores)
    3. scatter-plot predictions against true values with a y = x reference line.

    Parameters: x_test/y_test (test features and target), a fitted model object,
    a plot title `name`, and the scatter point `color`.
    Side effects: prints metrics through metric_scores() and shows a matplotlib figure.
    """
    # BUG FIX: use the x_test parameter — the original read the global X_test,
    # leaving the parameter dead (callers all passed x_test=X_test, so observed
    # behavior is unchanged).
    predicted = model.predict(x_test)
    actual = y_test
    metric_scores(actual, predicted, name)
    plt.figure(figsize=(12, 8))
    plt.scatter(actual, predicted, c=color)
    # y = x reference line spanning the full range of predictions and actuals.
    hi = max(max(predicted), max(actual))
    lo = min(np.min(predicted), np.min(actual))
    plt.plot([hi, lo], [hi, lo], '#000066')
    plt.xlabel('True Values', fontsize=15)
    plt.ylabel('Predictions', fontsize=15)
    plt.title(name, fontsize=30)
    plt.axis('equal')
    plt.show()
# Global store of evaluation results: model name -> [MAE, MSE, MAPE, R2].
d={}
def metric_scores(actual,predicted,name):
    """Evaluate predictions, store the scores in the global dict `d` under `name`, and print them.

    Inputs: actual (y_test), predicted (y_pred), name (model label used as dict key).
    Side effects: mutates the module-level dict `d`; prints MAE, MSE, MAPE and R2.
    """
    mae = mean_absolute_error(actual,predicted)
    mse = mean_squared_error(actual,predicted)
    mape = mean_absolute_percentage_error(actual,predicted)
    r2 = r2_score(actual,predicted)
    # Keep the raw (unrounded) scores for later comparison across models.
    d[name]=[mae,mse,mape,r2]
    print('Mean Absolute Error is {:.3f}'.format(mae))
    print()
    print('Mean Squared Error is {:.3f}'.format(mse))
    print()
    print('Mean Absolute Percentage Error is {:.3f}'.format(mape))
    print()
    print('R Squared Error is {:.3f}'.format(r2))
    print()
# Deeper test of the algorithm -> Random Forest with increasing tree counts.
for n_estimator in (5, 10, 50, 100, 500):
    print("--------------------------------------------------------------------------------------------------------------")
    # FIX: removed the stray ')' at the end of the printed message.
    print('Random Forest Model with {} estimators'.format(n_estimator))
    print("--------------------------------------------------------------------------------------------------------------")
    model = RandomForestRegressor(n_estimators=n_estimator)  # , verbose=4)
    model.fit(X_train, y_train)
    # FIX: the original passed 3 as a dead second argument to format()
    # ('...'.format(np.round(n_estimator), 3)); n_estimator is an int, so no
    # rounding is needed at all.
    title = 'RandomForest; no_estimators: {}.'.format(n_estimator)
    evaluate_model(x_test=X_test, y_test=y_test, model=model, name=title, color='lightgray')
    cr_val_score = cross_val_score(model, X_train, y_train, cv=3)
    mod_score = model.score(X_train, y_train)
    print("Model score: ", mod_score)
    print("Cross validation scores: ", cr_val_score)
    print("Accuracy of model with Train data: {}".format(np.round(model.score(X_train, y_train), 3)))
    print("Accuracy of model with Test data: {}".format(np.round(model.score(X_test, y_test), 3)))
    print()
    print()
-------------------------------------------------------------------------------------------------------------- Random Forest Model with 5 estimators) -------------------------------------------------------------------------------------------------------------- Mean Absolute Error is 292.373 Mean Squared Error is 336814.540 Mean Absolute Percentage Error is 0.072 R Squared Error is 0.979
Model score: 0.995175410545503 Cross validation scores: [0.97730635 0.97594457 0.97475489] Accuracy of model with Train data: 0.995 Accuracy of model with Test data: 0.979 -------------------------------------------------------------------------------------------------------------- Random Forest Model with 10 estimators) -------------------------------------------------------------------------------------------------------------- Mean Absolute Error is 278.774 Mean Squared Error is 306917.472 Mean Absolute Percentage Error is 0.068 R Squared Error is 0.981
Model score: 0.9964243323876145 Cross validation scores: [0.978957 0.97732469 0.97740548] Accuracy of model with Train data: 0.996 Accuracy of model with Test data: 0.981 -------------------------------------------------------------------------------------------------------------- Random Forest Model with 50 estimators) -------------------------------------------------------------------------------------------------------------- Mean Absolute Error is 267.629 Mean Squared Error is 289103.750 Mean Absolute Percentage Error is 0.065 R Squared Error is 0.982
Model score: 0.9972382637445087 Cross validation scores: [0.98056048 0.97910702 0.9789325 ] Accuracy of model with Train data: 0.997 Accuracy of model with Test data: 0.982 -------------------------------------------------------------------------------------------------------------- Random Forest Model with 100 estimators) -------------------------------------------------------------------------------------------------------------- Mean Absolute Error is 265.244 Mean Squared Error is 285306.845 Mean Absolute Percentage Error is 0.064 R Squared Error is 0.982
Model score: 0.9973908235094402 Cross validation scores: [0.9808535 0.97934117 0.97920626] Accuracy of model with Train data: 0.997 Accuracy of model with Test data: 0.982 -------------------------------------------------------------------------------------------------------------- Random Forest Model with 500 estimators) -------------------------------------------------------------------------------------------------------------- Mean Absolute Error is 264.306 Mean Squared Error is 283920.197 Mean Absolute Percentage Error is 0.064 R Squared Error is 0.982
Model score: 0.9974764759010886 Cross validation scores: [0.98102582 0.97945304 0.97952027] Accuracy of model with Train data: 0.997 Accuracy of model with Test data: 0.982
# Find best parameters for the model (max_depth of a 50-tree forest).
parameters = {'max_depth': [7, 50, 100]}
model = RandomForestRegressor(n_estimators=50)
# FIX: removed the redundant model.fit(X_train, y_train) before the search —
# GridSearchCV clones and fits the estimator itself, so the pre-fit only
# wasted time and its result was never used.
gridforest = GridSearchCV(model, parameters, cv=3, scoring='neg_mean_squared_error', return_train_score=True, verbose=1)
gridforest.fit(X_train, y_train)
gridforest.best_params_  # best max_depth found
Fitting 3 folds for each of 3 candidates, totalling 9 fits
{'max_depth': 100}
gridforest.best_estimator_  # the refitted forest with the winning parameters
RandomForestRegressor(max_depth=100, n_estimators=50)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestRegressor(max_depth=100, n_estimators=50)
# Quick Decision Tree sanity check (train-set R^2 only).
dec_tree = DecisionTreeRegressor(max_depth=10, random_state=42)
dec_tree.fit(X_train, y_train)
# Predict on the training data itself.
ypred = dec_tree.predict(X_train)
# Report the training accuracy (R^2).
print(r2_score(y_train, ypred))
0.9810915286864138
# Train-set R^2 for every (max_depth, criterion) combination.
max_dep = [6, 7, 8, 9, 10, 11, 12, 13]
criterions = ['squared_error', 'friedman_mse', 'poisson']  # 'absolute_error' -> slow
rows = []
for depth in max_dep:
    for crit in criterions:
        dec_tree = DecisionTreeRegressor(criterion=crit, max_depth=depth, random_state=42)
        dec_tree.fit(X_train, y_train)
        ypred2 = dec_tree.predict(X_train)
        rows.append([depth, crit, round(r2_score(y_train, ypred2), 5)])
# FIX: build the dataframe in one pass instead of growing it row-by-row with
# .loc (each such append copies the frame); also dropped the unused dep_score
# list. The original 1-based index is preserved.
df_scores = pd.DataFrame(rows, columns=["MaxDepth", "Criterion", "Score"], index=range(1, len(rows) + 1))
df_scores.sort_values(by=['Criterion', 'Score'], ascending=False)
| MaxDepth | Criterion | Score | |
|---|---|---|---|
| 22 | 13 | squared_error | 0.98973 |
| 19 | 12 | squared_error | 0.98729 |
| 16 | 11 | squared_error | 0.98444 |
| 13 | 10 | squared_error | 0.98109 |
| 10 | 9 | squared_error | 0.97643 |
| 7 | 8 | squared_error | 0.96954 |
| 4 | 7 | squared_error | 0.95905 |
| 1 | 6 | squared_error | 0.94380 |
| 24 | 13 | poisson | 0.98893 |
| 21 | 12 | poisson | 0.98638 |
| 18 | 11 | poisson | 0.98348 |
| 15 | 10 | poisson | 0.97986 |
| 12 | 9 | poisson | 0.97454 |
| 9 | 8 | poisson | 0.96715 |
| 6 | 7 | poisson | 0.95461 |
| 3 | 6 | poisson | 0.94173 |
| 23 | 13 | friedman_mse | 0.98973 |
| 20 | 12 | friedman_mse | 0.98729 |
| 17 | 11 | friedman_mse | 0.98444 |
| 14 | 10 | friedman_mse | 0.98109 |
| 11 | 9 | friedman_mse | 0.97643 |
| 8 | 8 | friedman_mse | 0.96954 |
| 5 | 7 | friedman_mse | 0.95905 |
| 2 | 6 | friedman_mse | 0.94380 |
# Bar chart of train-set R^2 per max_depth, grouped by split criterion.
plt.figure(figsize=(14,6))
ax = sns.barplot(data=df_scores, x="MaxDepth", y="Score", hue="Criterion")
# Place the legend outside the axes so it does not cover the bars.
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
# Annotate every bar with its score, 3 decimals.
for container in ax.containers:
    ax.bar_label(container, fmt='%.3f')
#from sklearn import tree
#tree.plot_tree(dec_tree)
# plt.figure(figsize=(12,8))
# tree.plot_tree(dec_tree.fit(X_train, y_train))
def display_scores(scores):
    """Print a CV score array together with its mean and standard deviation, rounded to 4 decimals."""
    for label, value in (("Scores", scores), ("Mean", scores.mean()), ("Standard deviation", scores.std())):
        print(f"{label}: {np.round(value, 4)}")
# 10-fold CV of dec_tree — note this is the LAST tree fitted in the sweep
# above (max_depth=13, criterion='poisson'), not the best one.
scores = cross_val_score(dec_tree, X_train, y_train, cv=10)
scores.mean()
0.9737318636857756
# print('{}: score: {:.2f}, std_dev:{:.2f}'.format(name,cv_results.mean(), cv_results.std()) )
# Exhaustive Decision Tree search over depth, criterion and splitter.
param_grid = [{
    'max_depth': [7, 8, 9, 10, 11, 12, 13],
    'criterion': ['squared_error', 'friedman_mse', 'poisson'],
    "splitter": ["best", "random"],
}]
dectree_grids = DecisionTreeRegressor(random_state=42)
# 5-fold CV, ranked by negative MSE; train scores kept for overfit diagnosis.
grid_search = GridSearchCV(dectree_grids, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_train, y_train)
GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=42),
param_grid=[{'criterion': ['squared_error', 'friedman_mse',
'poisson'],
'max_depth': [7, 8, 9, 10, 11, 12, 13],
'splitter': ['best', 'random']}],
return_train_score=True, scoring='neg_mean_squared_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=42),
param_grid=[{'criterion': ['squared_error', 'friedman_mse',
'poisson'],
'max_depth': [7, 8, 9, 10, 11, 12, 13],
'splitter': ['best', 'random']}],
return_train_score=True, scoring='neg_mean_squared_error')DecisionTreeRegressor(random_state=42)
DecisionTreeRegressor(random_state=42)
grid_search.best_params_  # winning (criterion, max_depth, splitter) combination
{'criterion': 'squared_error', 'max_depth': 11, 'splitter': 'best'}
grid_search.best_estimator_  # tree refitted on all training data with the best parameters
DecisionTreeRegressor(max_depth=11, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeRegressor(max_depth=11, random_state=42)
# RMSE per hyper-parameter combination (mean_test_score is negative MSE).
cvres = grid_search.cv_results_
for neg_mse, cfg in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-neg_mse), cfg)
811.203733509865 {'criterion': 'squared_error', 'max_depth': 7, 'splitter': 'best'}
1131.698211621473 {'criterion': 'squared_error', 'max_depth': 7, 'splitter': 'random'}
730.5231251138903 {'criterion': 'squared_error', 'max_depth': 8, 'splitter': 'best'}
1021.6581293356352 {'criterion': 'squared_error', 'max_depth': 8, 'splitter': 'random'}
673.8157933337246 {'criterion': 'squared_error', 'max_depth': 9, 'splitter': 'best'}
964.3877903436752 {'criterion': 'squared_error', 'max_depth': 9, 'splitter': 'random'}
644.2120586548302 {'criterion': 'squared_error', 'max_depth': 10, 'splitter': 'best'}
861.3205161895442 {'criterion': 'squared_error', 'max_depth': 10, 'splitter': 'random'}
637.9224778129726 {'criterion': 'squared_error', 'max_depth': 11, 'splitter': 'best'}
780.1462721761052 {'criterion': 'squared_error', 'max_depth': 11, 'splitter': 'random'}
642.8616216382479 {'criterion': 'squared_error', 'max_depth': 12, 'splitter': 'best'}
728.1017596941219 {'criterion': 'squared_error', 'max_depth': 12, 'splitter': 'random'}
657.944363857527 {'criterion': 'squared_error', 'max_depth': 13, 'splitter': 'best'}
697.492121320476 {'criterion': 'squared_error', 'max_depth': 13, 'splitter': 'random'}
811.203733509865 {'criterion': 'friedman_mse', 'max_depth': 7, 'splitter': 'best'}
1131.698211621473 {'criterion': 'friedman_mse', 'max_depth': 7, 'splitter': 'random'}
730.5231251138903 {'criterion': 'friedman_mse', 'max_depth': 8, 'splitter': 'best'}
1021.6581293356352 {'criterion': 'friedman_mse', 'max_depth': 8, 'splitter': 'random'}
673.8157933337246 {'criterion': 'friedman_mse', 'max_depth': 9, 'splitter': 'best'}
964.3877903436752 {'criterion': 'friedman_mse', 'max_depth': 9, 'splitter': 'random'}
644.2120586548302 {'criterion': 'friedman_mse', 'max_depth': 10, 'splitter': 'best'}
861.3205161895442 {'criterion': 'friedman_mse', 'max_depth': 10, 'splitter': 'random'}
637.9224778129726 {'criterion': 'friedman_mse', 'max_depth': 11, 'splitter': 'best'}
780.1462721761052 {'criterion': 'friedman_mse', 'max_depth': 11, 'splitter': 'random'}
642.7749118689634 {'criterion': 'friedman_mse', 'max_depth': 12, 'splitter': 'best'}
728.1017596941219 {'criterion': 'friedman_mse', 'max_depth': 12, 'splitter': 'random'}
657.944363857527 {'criterion': 'friedman_mse', 'max_depth': 13, 'splitter': 'best'}
697.492121320476 {'criterion': 'friedman_mse', 'max_depth': 13, 'splitter': 'random'}
864.6578198101585 {'criterion': 'poisson', 'max_depth': 7, 'splitter': 'best'}
1138.6354538086935 {'criterion': 'poisson', 'max_depth': 7, 'splitter': 'random'}
754.3182709588007 {'criterion': 'poisson', 'max_depth': 8, 'splitter': 'best'}
1106.7746319994892 {'criterion': 'poisson', 'max_depth': 8, 'splitter': 'random'}
689.0435166925455 {'criterion': 'poisson', 'max_depth': 9, 'splitter': 'best'}
910.2823922646587 {'criterion': 'poisson', 'max_depth': 9, 'splitter': 'random'}
656.4843009874584 {'criterion': 'poisson', 'max_depth': 10, 'splitter': 'best'}
898.5951012542447 {'criterion': 'poisson', 'max_depth': 10, 'splitter': 'random'}
638.6215828535659 {'criterion': 'poisson', 'max_depth': 11, 'splitter': 'best'}
809.2286888559626 {'criterion': 'poisson', 'max_depth': 11, 'splitter': 'random'}
642.4092445957064 {'criterion': 'poisson', 'max_depth': 12, 'splitter': 'best'}
716.0277332017755 {'criterion': 'poisson', 'max_depth': 12, 'splitter': 'random'}
652.8037200009762 {'criterion': 'poisson', 'max_depth': 13, 'splitter': 'best'}
709.7080383373989 {'criterion': 'poisson', 'max_depth': 13, 'splitter': 'random'}
pd.DataFrame(grid_search.cv_results_)  # full per-candidate CV results as a dataframe
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_criterion | param_max_depth | param_splitter | params | split0_test_score | split1_test_score | ... | mean_test_score | std_test_score | rank_test_score | split0_train_score | split1_train_score | split2_train_score | split3_train_score | split4_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.260450 | 0.066827 | 0.004801 | 5.116078e-03 | squared_error | 7 | best | {'criterion': 'squared_error', 'max_depth': 7,... | -6.353755e+05 | -6.134599e+05 | ... | -6.580515e+05 | 40728.390457 | 28 | -5.989101e+05 | -6.009869e+05 | -5.902879e+05 | -6.246628e+05 | -6.287177e+05 | -6.087131e+05 | 15164.820579 |
| 1 | 0.094996 | 0.044803 | 0.002602 | 4.912270e-04 | squared_error | 7 | random | {'criterion': 'squared_error', 'max_depth': 7,... | -1.042932e+06 | -1.515406e+06 | ... | -1.280741e+06 | 165953.895733 | 40 | -1.043233e+06 | -1.542838e+06 | -1.311143e+06 | -1.081811e+06 | -1.275237e+06 | -1.250853e+06 | 179535.841676 |
| 2 | 0.182997 | 0.017353 | 0.002401 | 8.006339e-04 | squared_error | 8 | best | {'criterion': 'squared_error', 'max_depth': 8,... | -5.250198e+05 | -5.033210e+05 | ... | -5.336640e+05 | 21357.133435 | 22 | -4.635733e+05 | -4.604083e+05 | -4.573478e+05 | -4.600744e+05 | -4.643482e+05 | -4.611504e+05 | 2540.240458 |
| 3 | 0.050598 | 0.008935 | 0.002001 | 9.221592e-07 | squared_error | 8 | random | {'criterion': 'squared_error', 'max_depth': 8,... | -8.613845e+05 | -1.143016e+06 | ... | -1.043785e+06 | 180991.168588 | 37 | -8.541742e+05 | -1.091291e+06 | -7.280620e+05 | -1.280387e+06 | -1.023361e+06 | -9.954550e+05 | 191106.773424 |
| 4 | 0.212659 | 0.078944 | 0.002802 | 7.528810e-04 | squared_error | 9 | best | {'criterion': 'squared_error', 'max_depth': 9,... | -4.587310e+05 | -4.254228e+05 | ... | -4.540277e+05 | 16557.051955 | 13 | -3.596882e+05 | -3.608855e+05 | -3.583801e+05 | -3.513679e+05 | -3.636445e+05 | -3.587932e+05 | 4099.281421 |
| 5 | 0.151877 | 0.025789 | 0.011200 | 1.455165e-02 | squared_error | 9 | random | {'criterion': 'squared_error', 'max_depth': 9,... | -8.468722e+05 | -9.054632e+05 | ... | -9.300438e+05 | 78985.320604 | 35 | -8.957059e+05 | -8.241726e+05 | -7.643624e+05 | -8.774860e+05 | -8.951914e+05 | -8.513837e+05 | 50750.472838 |
| 6 | 0.524401 | 0.326708 | 0.002803 | 7.477014e-04 | squared_error | 10 | best | {'criterion': 'squared_error', 'max_depth': 10... | -4.236699e+05 | -3.850778e+05 | ... | -4.150092e+05 | 21487.378301 | 7 | -2.915339e+05 | -2.879010e+05 | -2.855552e+05 | -2.821124e+05 | -2.924004e+05 | -2.879006e+05 | 3806.661700 |
| 7 | 0.066124 | 0.016504 | 0.002601 | 4.901948e-04 | squared_error | 10 | random | {'criterion': 'squared_error', 'max_depth': 10... | -8.149111e+05 | -5.296891e+05 | ... | -7.418730e+05 | 118427.874136 | 30 | -6.722672e+05 | -4.490996e+05 | -6.119588e+05 | -6.273953e+05 | -7.392641e+05 | -6.199970e+05 | 96210.293020 |
| 8 | 0.211778 | 0.024429 | 0.003204 | 9.890060e-04 | squared_error | 11 | best | {'criterion': 'squared_error', 'max_depth': 11... | -4.062283e+05 | -3.847816e+05 | ... | -4.069451e+05 | 18232.185792 | 1 | -2.401105e+05 | -2.348048e+05 | -2.320764e+05 | -2.276072e+05 | -2.361896e+05 | -2.341577e+05 | 4176.694950 |
| 9 | 0.085692 | 0.005233 | 0.003801 | 1.166589e-03 | squared_error | 11 | random | {'criterion': 'squared_error', 'max_depth': 11... | -7.041491e+05 | -6.139607e+05 | ... | -6.086282e+05 | 56655.047708 | 25 | -5.995078e+05 | -4.602772e+05 | -3.986709e+05 | -3.637453e+05 | -4.789362e+05 | -4.602275e+05 | 81051.878966 |
| 10 | 0.435084 | 0.129270 | 0.007200 | 5.192799e-03 | squared_error | 12 | best | {'criterion': 'squared_error', 'max_depth': 12... | -4.155120e+05 | -3.842305e+05 | ... | -4.132711e+05 | 20084.500258 | 6 | -1.899450e+05 | -1.916224e+05 | -1.874259e+05 | -1.827561e+05 | -1.914028e+05 | -1.886304e+05 | 3296.178354 |
| 11 | 0.112264 | 0.011466 | 0.003601 | 1.357950e-03 | squared_error | 12 | random | {'criterion': 'squared_error', 'max_depth': 12... | -5.405965e+05 | -5.209812e+05 | ... | -5.301322e+05 | 20519.620903 | 20 | -3.493139e+05 | -3.956588e+05 | -3.085430e+05 | -3.174347e+05 | -3.508746e+05 | -3.443650e+05 | 30680.210835 |
| 12 | 0.526038 | 0.073250 | 0.004599 | 1.738561e-03 | squared_error | 13 | best | {'criterion': 'squared_error', 'max_depth': 13... | -4.410598e+05 | -4.057989e+05 | ... | -4.328908e+05 | 16876.541152 | 11 | -1.514967e+05 | -1.504291e+05 | -1.482654e+05 | -1.436411e+05 | -1.515214e+05 | -1.490707e+05 | 2961.964779 |
| 13 | 0.163273 | 0.014669 | 0.007078 | 4.726531e-03 | squared_error | 13 | random | {'criterion': 'squared_error', 'max_depth': 13... | -4.495920e+05 | -4.833324e+05 | ... | -4.864953e+05 | 33715.996600 | 16 | -2.733390e+05 | -2.645216e+05 | -2.370438e+05 | -2.095200e+05 | -2.833594e+05 | -2.535568e+05 | 26876.312414 |
| 14 | 0.372112 | 0.067214 | 0.002001 | 1.813236e-06 | friedman_mse | 7 | best | {'criterion': 'friedman_mse', 'max_depth': 7, ... | -6.353755e+05 | -6.134599e+05 | ... | -6.580515e+05 | 40728.390457 | 28 | -5.989101e+05 | -6.009869e+05 | -5.902879e+05 | -6.246628e+05 | -6.287177e+05 | -6.087131e+05 | 15164.820579 |
| 15 | 0.124831 | 0.027591 | 0.001804 | 9.814748e-04 | friedman_mse | 7 | random | {'criterion': 'friedman_mse', 'max_depth': 7, ... | -1.042932e+06 | -1.515406e+06 | ... | -1.280741e+06 | 165953.895733 | 40 | -1.043233e+06 | -1.542838e+06 | -1.311143e+06 | -1.081811e+06 | -1.275237e+06 | -1.250853e+06 | 179535.841676 |
| 16 | 0.385050 | 0.078768 | 0.002601 | 4.903115e-04 | friedman_mse | 8 | best | {'criterion': 'friedman_mse', 'max_depth': 8, ... | -5.250198e+05 | -5.033210e+05 | ... | -5.336640e+05 | 21357.133435 | 22 | -4.635733e+05 | -4.604083e+05 | -4.573478e+05 | -4.600744e+05 | -4.643482e+05 | -4.611504e+05 | 2540.240458 |
| 17 | 0.162199 | 0.014176 | 0.002202 | 4.004968e-04 | friedman_mse | 8 | random | {'criterion': 'friedman_mse', 'max_depth': 8, ... | -8.613845e+05 | -1.143016e+06 | ... | -1.043785e+06 | 180991.168588 | 37 | -8.541742e+05 | -1.091291e+06 | -7.280620e+05 | -1.280387e+06 | -1.023361e+06 | -9.954550e+05 | 191106.773424 |
| 18 | 0.550628 | 0.058242 | 0.002599 | 4.880961e-04 | friedman_mse | 9 | best | {'criterion': 'friedman_mse', 'max_depth': 9, ... | -4.587310e+05 | -4.254228e+05 | ... | -4.540277e+05 | 16557.051955 | 13 | -3.596882e+05 | -3.608855e+05 | -3.583801e+05 | -3.513679e+05 | -3.636445e+05 | -3.587932e+05 | 4099.281421 |
| 19 | 0.130011 | 0.040389 | 0.009601 | 1.320000e-02 | friedman_mse | 9 | random | {'criterion': 'friedman_mse', 'max_depth': 9, ... | -8.468722e+05 | -9.054632e+05 | ... | -9.300438e+05 | 78985.320604 | 35 | -8.957059e+05 | -8.241726e+05 | -7.643624e+05 | -8.774860e+05 | -8.951914e+05 | -8.513837e+05 | 50750.472838 |
| 20 | 0.582724 | 0.076552 | 0.003201 | 4.015693e-04 | friedman_mse | 10 | best | {'criterion': 'friedman_mse', 'max_depth': 10,... | -4.236699e+05 | -3.850778e+05 | ... | -4.150092e+05 | 21487.378301 | 7 | -2.915339e+05 | -2.879010e+05 | -2.855552e+05 | -2.821124e+05 | -2.924004e+05 | -2.879006e+05 | 3806.661700 |
| 21 | 0.179986 | 0.009916 | 0.003195 | 7.428871e-04 | friedman_mse | 10 | random | {'criterion': 'friedman_mse', 'max_depth': 10,... | -8.149111e+05 | -5.296891e+05 | ... | -7.418730e+05 | 118427.874136 | 30 | -6.722672e+05 | -4.490996e+05 | -6.119588e+05 | -6.273953e+05 | -7.392641e+05 | -6.199970e+05 | 96210.293020 |
| 22 | 0.612639 | 0.099552 | 0.003201 | 3.998314e-04 | friedman_mse | 11 | best | {'criterion': 'friedman_mse', 'max_depth': 11,... | -4.062283e+05 | -3.847816e+05 | ... | -4.069451e+05 | 18232.185792 | 1 | -2.401105e+05 | -2.348048e+05 | -2.320764e+05 | -2.276072e+05 | -2.361896e+05 | -2.341577e+05 | 4176.694950 |
| 23 | 0.193624 | 0.056749 | 0.004400 | 3.322113e-03 | friedman_mse | 11 | random | {'criterion': 'friedman_mse', 'max_depth': 11,... | -7.041491e+05 | -6.139607e+05 | ... | -6.086282e+05 | 56655.047708 | 25 | -5.995078e+05 | -4.602772e+05 | -3.986709e+05 | -3.637453e+05 | -4.789362e+05 | -4.602275e+05 | 81051.878966 |
| 24 | 0.548043 | 0.043517 | 0.003198 | 4.020641e-04 | friedman_mse | 12 | best | {'criterion': 'friedman_mse', 'max_depth': 12,... | -4.149547e+05 | -3.842305e+05 | ... | -4.131596e+05 | 20073.296234 | 5 | -1.899447e+05 | -1.916224e+05 | -1.874259e+05 | -1.827561e+05 | -1.914028e+05 | -1.886304e+05 | 3296.159344 |
| 25 | 0.193837 | 0.032554 | 0.006395 | 4.845981e-03 | friedman_mse | 12 | random | {'criterion': 'friedman_mse', 'max_depth': 12,... | -5.405965e+05 | -5.209812e+05 | ... | -5.301322e+05 | 20519.620903 | 20 | -3.493139e+05 | -3.956588e+05 | -3.085430e+05 | -3.174347e+05 | -3.508746e+05 | -3.443650e+05 | 30680.210835 |
| 26 | 0.658242 | 0.037382 | 0.013402 | 1.930348e-02 | friedman_mse | 13 | best | {'criterion': 'friedman_mse', 'max_depth': 13,... | -4.410598e+05 | -4.057989e+05 | ... | -4.328908e+05 | 16876.541152 | 11 | -1.514967e+05 | -1.504291e+05 | -1.482654e+05 | -1.436411e+05 | -1.515214e+05 | -1.490707e+05 | 2961.964779 |
| 27 | 0.204617 | 0.057046 | 0.008609 | 6.655216e-03 | friedman_mse | 13 | random | {'criterion': 'friedman_mse', 'max_depth': 13,... | -4.495920e+05 | -4.833324e+05 | ... | -4.864953e+05 | 33715.996600 | 16 | -2.733390e+05 | -2.645216e+05 | -2.370438e+05 | -2.095200e+05 | -2.833594e+05 | -2.535568e+05 | 26876.312414 |
| 28 | 0.307243 | 0.059828 | 0.002602 | 8.018737e-04 | poisson | 7 | best | {'criterion': 'poisson', 'max_depth': 7, 'spli... | -7.096264e+05 | -7.395386e+05 | ... | -7.476331e+05 | 23950.885637 | 32 | -7.074233e+05 | -6.854771e+05 | -6.883518e+05 | -7.080239e+05 | -6.936158e+05 | -6.965784e+05 | 9468.949245 |
| 29 | 0.068809 | 0.006703 | 0.002204 | 3.966096e-04 | poisson | 7 | random | {'criterion': 'poisson', 'max_depth': 7, 'spli... | -9.755043e+05 | -1.212191e+06 | ... | -1.296491e+06 | 203151.380295 | 42 | -9.929619e+05 | -1.212443e+06 | -1.434235e+06 | -1.219159e+06 | -1.385235e+06 | -1.248807e+06 | 155340.921689 |
| 30 | 0.226773 | 0.031424 | 0.004599 | 3.200723e-03 | poisson | 8 | best | {'criterion': 'poisson', 'max_depth': 8, 'spli... | -5.645684e+05 | -5.603239e+05 | ... | -5.689961e+05 | 16733.783218 | 24 | -5.037730e+05 | -4.973795e+05 | -5.035480e+05 | -4.963966e+05 | -4.986461e+05 | -4.999486e+05 | 3114.324149 |
| 31 | 0.103635 | 0.025246 | 0.002607 | 4.946398e-04 | poisson | 8 | random | {'criterion': 'poisson', 'max_depth': 8, 'spli... | -1.311710e+06 | -1.278795e+06 | ... | -1.224950e+06 | 122600.688277 | 39 | -1.301882e+06 | -1.338387e+06 | -9.473921e+05 | -1.158349e+06 | -1.210267e+06 | -1.191255e+06 | 137695.075317 |
| 32 | 0.241054 | 0.024483 | 0.002997 | 6.315269e-04 | poisson | 9 | best | {'criterion': 'poisson', 'max_depth': 9, 'spli... | -4.738021e+05 | -4.611727e+05 | ... | -4.747810e+05 | 18511.064633 | 15 | -3.841544e+05 | -3.808842e+05 | -3.880645e+05 | -3.811331e+05 | -3.850104e+05 | -3.838493e+05 | 2660.007034 |
| 33 | 0.068441 | 0.005870 | 0.002600 | 4.908562e-04 | poisson | 9 | random | {'criterion': 'poisson', 'max_depth': 9, 'spli... | -1.044076e+06 | -5.924331e+05 | ... | -8.286140e+05 | 188548.524959 | 34 | -1.006847e+06 | -5.969701e+05 | -5.473150e+05 | -9.028315e+05 | -8.462557e+05 | -7.800439e+05 | 178089.529005 |
| 34 | 0.139797 | 0.017359 | 0.002201 | 7.497183e-04 | poisson | 10 | best | {'criterion': 'poisson', 'max_depth': 10, 'spl... | -4.444842e+05 | -4.076857e+05 | ... | -4.309716e+05 | 23247.320590 | 10 | -3.092662e+05 | -3.003146e+05 | -3.037569e+05 | -3.037668e+05 | -3.079349e+05 | -3.050079e+05 | 3219.723451 |
| 35 | 0.041806 | 0.000740 | 0.002200 | 4.007826e-04 | poisson | 10 | random | {'criterion': 'poisson', 'max_depth': 10, 'spl... | -8.353816e+05 | -6.439340e+05 | ... | -8.074732e+05 | 164095.823438 | 33 | -8.541640e+05 | -6.501399e+05 | -9.036673e+05 | -5.117828e+05 | -8.437318e+05 | -7.526972e+05 | 148278.484107 |
| 36 | 0.153998 | 0.008390 | 0.002600 | 4.891631e-04 | poisson | 11 | best | {'criterion': 'poisson', 'max_depth': 11, 'spl... | -4.186783e+05 | -3.945122e+05 | ... | -4.078375e+05 | 14341.060961 | 3 | -2.510950e+05 | -2.438683e+05 | -2.466376e+05 | -2.444399e+05 | -2.465483e+05 | -2.465178e+05 | 2541.807976 |
| 37 | 0.048198 | 0.000751 | 0.002200 | 3.983266e-04 | poisson | 11 | random | {'criterion': 'poisson', 'max_depth': 11, 'spl... | -7.015431e+05 | -7.192359e+05 | ... | -6.548511e+05 | 78156.443136 | 27 | -6.557390e+05 | -6.074547e+05 | -4.589817e+05 | -6.047912e+05 | -4.193488e+05 | -5.492631e+05 | 92558.330681 |
| 38 | 0.162838 | 0.003328 | 0.002809 | 4.040404e-04 | poisson | 12 | best | {'criterion': 'poisson', 'max_depth': 12, 'spl... | -4.170512e+05 | -4.000160e+05 | ... | -4.126896e+05 | 15580.489214 | 4 | -2.010111e+05 | -1.963072e+05 | -1.997041e+05 | -1.964447e+05 | -1.987733e+05 | -1.984481e+05 | 1835.728440 |
| 39 | 0.057399 | 0.008380 | 0.002200 | 3.991844e-04 | poisson | 12 | random | {'criterion': 'poisson', 'max_depth': 12, 'spl... | -5.341241e+05 | -4.378845e+05 | ... | -5.126957e+05 | 63032.933932 | 19 | -3.944072e+05 | -3.393747e+05 | -4.748407e+05 | -2.937125e+05 | -3.338988e+05 | -3.672468e+05 | 62626.468167 |
| 40 | 0.188418 | 0.014933 | 0.002401 | 4.903718e-04 | poisson | 13 | best | {'criterion': 'poisson', 'max_depth': 13, 'spl... | -4.257155e+05 | -4.035098e+05 | ... | -4.261527e+05 | 22571.530849 | 9 | -1.614691e+05 | -1.562810e+05 | -1.591754e+05 | -1.545017e+05 | -1.586777e+05 | -1.580210e+05 | 2411.072286 |
| 41 | 0.060129 | 0.000787 | 0.002001 | 2.231476e-06 | poisson | 13 | random | {'criterion': 'poisson', 'max_depth': 13, 'spl... | -4.376854e+05 | -4.648492e+05 | ... | -5.036855e+05 | 45328.699642 | 18 | -2.995263e+05 | -2.927464e+05 | -3.520271e+05 | -3.165022e+05 | -3.659355e+05 | -3.253475e+05 | 28868.657072 |
42 rows × 23 columns
# Final check: Decision Tree with the best grid-search parameters on the test set.
dec_tree_test = DecisionTreeRegressor(criterion='squared_error', max_depth=11, splitter='best', random_state=42)
dec_tree_test.fit(X_train, y_train)
# Predict on the held-out data.
ypred_test = dec_tree_test.predict(X_test)
# Report the test-set R^2.
print(r2_score(y_test, ypred_test))
0.9753397323035329
Die Scores der Modelle sind:
Random Forest:
Accuracy of model with Test data: 0.982
Linear Regression:
Accuracy of model with Test data: 0.8931
Linear Regression Ridge:
Accuracy of model with Test data: 0.8928
Linear Regression Lasso:
Accuracy of model with Test data: 0.8918
Decision Tree Regression:
Accuracy of model with Test data: 0.975
Auf der Grundlage dieser Daten ist das beste Modell Decision Tree. Lineare Regressionsmodelle schneiden nicht sehr gut ab und das Random-Forest-Modell ist overfitted.